* Start from a clean session: close any log left open, wipe memory, and turn off output paging
capture log close
clear all
set more off
***************************************************************************************************
* 
* Program: inaccurate_discrimination.do
* Last Modified: 7/12/2023
* Purpose:
*        Study inaccurate statistical discrimination in MTurk hiring task
* Sections:
*     0. Set Globals
*     1. Insheet and Clean Qualtrics Survey Output
*     2. Main Paper Tables and Figures
*     3. Appendix Tables and Figures
*     4. Additional Stats
* Files Used:
*     1. Qualtrics-Part1.csv & Qualtrics-Part2.csv (survey responses)
*     2. MTurk-US-Part1.csv & MTurk-India-Part1.csv (worker payments)
*     3. MTurk-US-Part2.csv & MTurk-India-Part2.csv (employer payments)
* Data Files Created:
*     1. cleaned_worker.dta, mturk_worker_US.dta, mturk_worker_India.dta, mturk_worker.dta 
*     2. cleaned_employer.dta, mturk_employer_US.dta, mturk_employer_India.dta, mturk_employer.dta 
*     3. inaccurate_discrimination.dta
* Tables & Figures Created:
*     1. table2.tex (Table 2: Wages & Productivities for Hiring Task 1)
*     2. table3.tex (Table 3: Beliefs about Productivity by Employee Characteristics)
*     3. table4.tex (Table 4: Effect of Information - (Difference-in-Differences by Hiring Task))
*     4. figureB1.png (Figure B1: Productivity Distributions by Group)
*     5. figureB2.png (Figure B2: Productivity Distributions by Group)
*     6. tableB1.tex (Table B1: Summary Statistics)
*     7. tableB2.tex (Table B2: Discrimination in Wages, by Employee Characteristics)
*     8. tableB3.tex (Table B3: In-Group Bias Test (Hiring Task 1))
*     9. tableB4.tex (Table B4: In-Group vs. Out-Group Beliefs about Productivity by Employee Char.)
*     10. tableB5_sumstats.tex & tableB5_diff.tex (Table B5: Effects of Large Incentives...)
*     11. tableB6.tex (Table B6: Beliefs about Productivity by Employee Characteristics, Trimmed)
*
* Programs Installed (commands to run for installation):
*     1. ssc install winsor2
*     2. ssc install texdoc
*
* To construct Tables 2, 3, B4, & B6, type: texdoc do "${do}inaccurate_discrimination.do"
* Tables B1 and B5 require some additional reshaping/formatting to produce the "_final.tex" files
*
****************************************************************************************************

****************************************************************************************************
* 0. Set Globals
****************************************************************************************************

* Set directories 
*  Replace the path assigned to master_dir below with the local directory in which you've downloaded the Replication Archive

* Root of the Replication Archive; every other path below is built from it
global master_dir "/Users/kareemhaggag/Dropbox/Inaccurate Statistical Discrimination/Replication Archive"

* Sub-directories (each keeps its trailing slash so that file names can be appended directly)
global raw     "${master_dir}/Data-Raw/"
global clean   "${master_dir}/Data-Extract/"
global do      "${master_dir}/Scripts/"
global tables  "${master_dir}/Results/tables/"
global figures "${master_dir}/Results/figures/"
global log     "${master_dir}/Results/log/"

* Work from the archive root and open the log for this run
cd "${master_dir}"
log using "${log}inaccurate_discrimination.log", replace

***************************************************************************************************
*  1. Insheet and Clean Qualtrics Survey Output
***************************************************************************************************

***********************************************
* Worker Task (Trivia - Survey 1) survey output
***********************************************

* Load the Survey 1 (trivia) export from Qualtrics
insheet using "${raw}Qualtrics-Part1.csv", comma name clear

* Identifier and timing fields
rename responseid id_worker
rename durationinseconds duration_worker

* Profile questions q53-q59
rename q53 color
rename q54 movie
rename q55 coffeetea
rename q56 age_worker
rename q57 gender_worker
rename q58 subject
rename q59 sport

* Variable labels
label variable id_worker       "ID of Worker"
label variable duration_worker "Survey 1 Duration (Seconds)"
label variable age_worker      "Age (Worker)"
label variable gender_worker   "Gender (Worker)"
label variable coffeetea       "Prefer Coffee or Tea"
label variable color           "Favorite Color"
label variable movie           "Favorite Movie"
label variable subject         "Favorite Subject"
label variable sport           "Favorite Sport"

* Score the 50 trivia questions (q2-q51). No answer key is read in: within each question, every
* response tied for the most frequent response is marked correct. Per the authors, the correct
* answer happens to be the modal answer for every question.
forvalues k = 2/51 {
    tempvar n_same
    * Number of respondents who gave the same answer as this respondent
    bysort q`k': gen `n_same' = _N
    * Size of the largest answer group for this question
    quietly summarize `n_same', meanonly
    gen q`k'_correct = (`n_same' == r(max))
    drop `n_same'
}

* Total score is the count of correct answers; the item-level variables are then discarded
egen score = rowtotal(q2_correct-q51_correct)
summarize score, detail
drop q2-q51 q2_correct-q51_correct
label variable score "Trivia score"

* Park the cleaned survey responses on disk while the MTurk payment records are assembled.
* The later merge against these records is what removes respondents who did not successfully
* submit the survey (5 people, leaving 599 respondents, per the authors' notes).
* NOTE: MTurk Worker IDs (PII) have been manually removed from the MTurk payments files
save "${clean}cleaned_worker.dta", replace

* Build one file of approved assignments per country. India is processed last so that it is the
* dataset left in memory for the append that follows.
local name_US    "United States"
local name_India "India"
foreach c in US India {
    insheet using "${raw}MTurk-`c'-Part1.csv", comma name clear
    keep if assignmentstatus=="Approved"
    rename answersurveycode id_worker
    gen country_worker="`name_`c''"
    keep id_worker country_worker
    save "${clean}mturk_worker_`c'.dta", replace
}

* Stack the US records under the India records and save the combined payment file
append using "${clean}mturk_worker_US.dta"
label variable country_worker "Country (Worker)"
sort id_worker
save "${clean}mturk_worker.dta", replace

* Attach the MTurk country records to the survey responses
use "${clean}cleaned_worker.dta", clear
merge 1:1 id_worker using "${clean}mturk_worker.dta"

* Trim on survey duration: drop rows below the 1st percentile or above the 99th percentile
* (the bottom 1% and the top 1%; <229 and >3274 seconds per the authors' notes, which report
* 12 submissions dropped). Rows with a missing duration, such as payment records with no
* survey response, are dropped here as well.
* This is to match the R code written by the RA (i.e. to construct the data shown to employers)
summarize duration_worker, detail
drop if !(duration_worker>=r(p1) & duration_worker<=r(p99))
display _N

* Keep only surveys that matched an MTurk record (the authors report 3 unmatched surveys)
tabulate _merge
display _N
keep if _merge==3
display _N
drop _merge

* id_profile links to the "recorder" field in the employer survey (the identifier that tracks
* which profile the Javascript showed to an Employer). It is simply the running count of
* workers after sorting by their Qualtrics-assigned response ID.
sort id_worker
gen id_profile=_n
save "${clean}cleaned_worker.dta", replace

*********************************************************
* Employer Task (Hiring - Survey 2)
*********************************************************

* Load the Survey 2 (hiring) export from Qualtrics
* NOTE: MTurk Worker IDs (PII) have been manually removed from the survey output
insheet using "${raw}Qualtrics-Part2.csv", comma name clear

* Identifier and timing fields
rename responseid id_employer
rename duration duration_employer

* Employer demographics
rename q17 gender_employer
rename q18 age_employer
rename q21 education_employer

* Variable labels
label variable id_employer        "ID of Employer"
label variable duration_employer  "Survey Duration (Seconds)"
label variable gender_employer    "Gender (Employer)"
label variable age_employer       "Age (Employer)"
label variable education_employer "Education Level (Employer)"

* Each prediction was recorded in one of two question blocks, q24_* or q42_*. Collapse them into
* a single variable per group: use the q24 item when it is filled in, otherwise the q42 item.
* The list below pairs each group name with its item suffix.
local items "female 1 male 14 us 20 india 3 young 4 old 5"
while "`items'" != "" {
    gettoken grp items : items
    gettoken idx items : items
    gen pred_`grp' = cond(q24_`idx'!=., q24_`idx', q42_`idx')
}

* Incentive condition: 0 when the q24 block was answered, 1 when only the q42 block was answered,
* missing when neither was answered
gen incentivized = 0 if q24_1!=.
replace incentivized = 1 if q42_1!=. & q24_1==.

* The raw prediction items are no longer needed
drop q24* q42* prediction_do

* Park the cleaned employer responses on disk while the MTurk payment records are assembled.
* The later merge against these records is what removes employers who did not successfully
* submit the survey (6 people, leaving 587 employers, per the authors' notes).
save "${clean}cleaned_employer.dta", replace

* Build one file of approved assignments per country. India is processed last so that it is the
* dataset left in memory for the append that follows.
local name_US    "United States"
local name_India "India"
foreach c in US India {
    insheet using "${raw}MTurk-`c'-Part2.csv", comma name clear
    keep if assignmentstatus=="Approved"
    rename answersurveycode id_employer
    gen country_employer="`name_`c''"
    keep id_employer country_employer assignmentstatus
    save "${clean}mturk_employer_`c'.dta", replace
}

* Stack the US records under the India records and save the combined payment file
append using "${clean}mturk_employer_US.dta"
label variable country_employer "Country (Employer)"
sort id_employer
save "${clean}mturk_employer.dta", replace

* Attach the MTurk country records to the employer responses
use "${clean}cleaned_employer.dta", clear
merge 1:1 id_employer using "${clean}mturk_employer.dta"
tabulate _merge
display _N

* One employer emailed their survey code to the RA separately, so they have no payment-file
* match. Assign their country by hand and treat them as matched.
replace country_employer="United States" if id_employer=="R_sze0cr7UEpU38Rz"
replace _merge=3 if id_employer=="R_sze0cr7UEpU38Rz"
keep if _merge==3
display _N
drop _merge

* Follow the RA's rule of dropping employers who took under 5 minutes (300 seconds).
* The authors report this removes 7 employers and leaves 580.
display _N
keep if duration_employer>=300
display _N

* Rename the wage columns to wage1-wage30 so they can be reshaped alongside the profile IDs.
* The first column of each hiring task carries its own name; the rest were auto-named by
* position on import.
rename _r wage1
rename _q28 wage21

* Wages 2-20 sit in columns v7-v25 (offset of 5)
forvalues j = 2/20 {
    local col = `j' + 5
    rename v`col' wage`j'
}

* Wages 22-30 sit in columns v39-v47 (offset of 17)
forvalues j = 22/30 {
    local col = `j' + 17
    rename v`col' wage`j'
}

* Recorder is the field where the profile associated with each wage is stored
* Split this up so we can then reshape.
* Splitting on "|" creates recorder1, recorder2, ... (one variable per delimited piece).
split recorder, p("|")
* There are 3 cases where it appears the Employer hit next without assigning a score. These are 
* cases where the order field is doubled (e.g. for R_331MPOZhr0c1Zbt there are two profiles 
* associated with order "5"). Since it's not clear which one was observed by the employer, we drop
* these cases. They are employers: R_331MPOZhr0c1Zbt, R_eer8ZzjQ00TZevv, R_uvKzkkeylN7pDkB
* A non-empty 32nd piece is what identifies these cases; list them in the log before dropping.
list id_employer recorder* if recorder32!=""
drop if recorder32!=""
* Discard the unsplit string, the display variable, and pieces 31-32, leaving recorder1-recorder30
* to pair with wage1-wage30.
drop recorder32 recorder display
drop recorder31
* Each remaining piece is split again on "-"; only the part before the dash is kept, under the
* original piece name. It becomes id_profile after the reshape.
forvalues i=1/30 {
    split recorder`i', p("-") gen(recorder`i'_)
    drop recorder`i'_2 recorder`i'
    rename recorder`i'_1 recorder`i'
}
* Go from one row per employer to one row per employer x profile shown (order = 1..30)
reshape long recorder wage, i(id_employer) j(order)
rename recorder id_profile
* The split pieces are strings; convert to numeric so they can be matched to the worker file
destring id_profile, replace
label var order "Order of Worker Profile"
label var wage "Wage WTP"
label var id_profile "ID of Profile"
sort id_profile
save "${clean}cleaned_employer.dta", replace

**********************************
* Merged Employer-Employee Dataset
**********************************

* Attach each displayed profile to the worker behind it (many employer rows per worker).
* The match status is tabulated for the log and then discarded; unmatched rows are not dropped.
merge m:1 id_profile using "${clean}cleaned_worker.dta"
tabulate _merge
display _N
drop _merge

* Flag one row per worker and one row per employer, for summarizing variables that do not vary
* within worker or within employer
egen tag_worker = tag(id_worker)
egen tag_employer = tag(id_employer)

* Build 0/1 indicators for gender, country, and age group, for both employers and workers.
* Any value other than the two named categories (or a missing age) leaves the indicator missing.
foreach X in employer worker {
    gen female_`X' = cond(gender_`X'=="Female", 1, cond(gender_`X'=="Male", 0, .))
    gen india_`X' = cond(country_`X'=="India", 1, cond(country_`X'=="United States", 0, .))
    gen old_`X' = (age_`X'>33) if age_`X'!=.

    * Value labels
    label define female_`X' 0 "Male" 1 "Female"
    label define india_`X' 0 "US" 1 "India"
    label define old_`X' 0 "Under 33" 1 "Over 33"
    foreach v in female india old {
        label values `v'_`X' `v'_`X'
    }

    * Variable labels
    label variable female_`X' "Female (Yes = 1)"
    label variable india_`X' "From India (Yes = 1)"
    label variable old_`X' "Over 33 (Yes = 1)"
}

* Tea vs. coffee: 1 for "Tea", 0 for "Coffee", missing for anything else
gen preftea_worker = cond(coffeetea=="Tea", 1, cond(coffeetea=="Coffee", 0, .))
label variable preftea_worker "Prefer Tea (Yes = 1)"

* The free-text favorites below are binarized by a case-insensitive substring search.
* A blank response contains no match and is therefore coded 0.

* Favorite high school subject: mentions math
tabulate subject, sort
gen math_fav = (strpos(lower(subject), "math") != 0)
tabulate math_fav
label variable math_fav "Favorite Subject: Math"

* Favorite sport: mentions soccer or football
tabulate sport, sort
gen football_fav = (strpos(lower(sport), "soccer") != 0) | (strpos(lower(sport), "football") != 0)
tabulate football_fav
label variable football_fav "Favorite Sport: Football or Soccer"

* Favorite color: mentions blue
tabulate color, sort
gen blue_fav = (strpos(lower(color), "blue") != 0)
tabulate blue_fav
label variable blue_fav "Favorite Color: Blue"

* Favorite movie: mentions any of the titles below (per the authors, the movies noted by at
* least 5 people)
tabulate movie if tag_worker == 1, sort
gen popmovie_fav = 0
foreach title in "titanic" "star wars" "shawshank" "avatar" "inception" "rings" "matrix" "princess bride" {
    replace popmovie_fav = 1 if strpos(lower(movie), "`title'") > 0
}
tabulate popmovie_fav if tag_worker==1
label variable popmovie_fav "Favorite Movie: Popular Movie"

* Demographics of Employer
* College indicator: 0 for the four listed sub-bachelor categories, 1 for any other non-blank
* response. A blank education_employer (no answer recorded, or a row with no employer attached)
* is left missing instead of being counted as college educated, matching how the other
* indicators treat missing inputs.
gen educ_college_employer=1 if education_employer!=""
replace educ_college_employer=0 if inlist(education_employer, "Some College", ///
  "Vocational/Technical School (2 year)", "High School or equivalent", ///
  "Less than High School")
label var educ_college_employer "College Education or Above"

* Divide survey duration variables so they are in minutes instead of seconds.
* Uses foreach rather than the obsolete, no-longer-documented -for- command.
foreach X in worker employer {
    replace duration_`X'=duration_`X'/60
}
label var duration_worker "Survey 1 Duration (Minutes)"
label var duration_employer "Survey 2 Duration (Minutes)"

label var incentivized "Predictions Incentivized"

* Profiles 1-20 form Hiring Task 1 and profiles 21-30 form Hiring Task 2.
* Rows with a missing order are left missing.
gen hiringtask_number=1 if order<=20
replace hiringtask_number=2 if order>20 & order<=30
label var hiringtask_number "Hiring Task Number (1 = Before Info)"

* Labels for the employer predictions (the pred_old label previously omitted the "33")
label var pred_female "Prediction for Female Workers"
label var pred_male "Prediction for Male Workers"
label var pred_india "Prediction for Indian Workers"
label var pred_us "Prediction for US Workers"
label var pred_old "Prediction for Over 33 Workers"
label var pred_young "Prediction for Under 33 Workers"

* Analysis dataset used by all tables and figures below
save "${clean}inaccurate_discrimination.dta", replace

****************************************************************************************************
*  2. Main Paper Tables and Figures
****************************************************************************************************

use "${clean}inaccurate_discrimination.dta", clear

* Table 2: Wages & Productivities for Hiring Task 1
* For each worker characteristic and each outcome (wage, score), collect group means, SDs, and
* counts from a t test, and the difference and p-value from a regression with two-way clustered
* standard errors. Everything is stored in locals named outcome_stat_characteristic for the
* table-writing code.
foreach X in female india old {
    foreach Y in wage score {
        * Group 1 is the indicator equal to 0, group 2 is the indicator equal to 1
        ttest `Y' if hiringtask==1, by(`X'_worker)
        forvalues g = 1/2 {
            local k = `g' - 1
            local `Y'_mean`k'_`X' = r(mu_`g')
            local `Y'_sd`k'_`X' = r(sd_`g')
            local `Y'_N`k'_`X' = r(N_`g')
        }

        * Difference is reported as group 1 minus group 2, hence the sign flip on the coefficient
        reghdfe `Y' `X'_worker if hiringtask==1, cluster(id_employer id_worker) noabsorb
        matrix mytable = r(table)
        local `Y'_diff_`X' = -1*mytable[1,1]
        local `Y'_p_`X' = mytable[4,1]

        * Format for display: two decimals for statistics, comma-separated integers for counts
        foreach s in mean0 sd0 mean1 sd1 p diff {
            local `Y'_`s'_`X': di %6.2f ``Y'_`s'_`X''
        }
        foreach s in N0 N1 {
            local `Y'_`s'_`X': di %12.0gc ``Y'_`s'_`X''
        }
    }
}
* Row labels for Table 2 and Table 3. Group 1 is the indicator equal to 0, group 2 is the
* indicator equal to 1.
global label_female "Gender (1 = Male, 2 = Female)"
global label_india "Country (1 = US, 2 = India)"
global label_old "Age (1 = Under 33, 2 = Over 33)"

* Write table2.tex line by line. The formatted locals it uses (wage_* and score_*) must already
* have been filled in by the Table 2 statistics loop earlier in this file.
texdoc init "${tables}table2.tex", replace
tex {\def\sym#1{\ifmmode^{#1}\else\(^{#1}\)\fi}
tex \begin{tabular}{l*{6}{c}}
tex \hline\hline
tex &\textbf{Group 1}& \textbf{Group 2}&\textbf{Diff.} &\textbf{p-val} & \textbf{\#Obs. G1}  & \textbf{\#Obs. G2}\\
tex &\multicolumn{1}{c}{(1)}&\multicolumn{1}{c}{(2)}&\multicolumn{1}{c}{(3)}& ///
  \multicolumn{1}{c}{(4)}&\multicolumn{1}{c}{(5)}&\multicolumn{1}{c}{(6)}\\
tex \hline
tex \multicolumn{6}{l}{\textbf{Panel A: Employers' Wage WTP, by Employee Characteristics}}\\      
* Panel A rows: one line of means, difference, p-value, and counts, then a line of SDs in parentheses
foreach X in female india old {
    tex ${label_`X'} & `wage_mean0_`X'' & `wage_mean1_`X'' & `wage_diff_`X'' & `wage_p_`X'' ///
      & `wage_N0_`X'' & `wage_N1_`X'' \\
    tex              & (`wage_sd0_`X'') & (`wage_sd1_`X'') &            &         &          &   \\
}   
tex &&&&&&\\                                             
tex \multicolumn{6}{l}{\textbf{Panel B: Employee Productivity, by Employee Characteristics}}\\         
* Panel B rows: same layout, using the trivia score statistics
foreach X in female india old {
    tex ${label_`X'} & `score_mean0_`X'' & `score_mean1_`X'' & `score_diff_`X'' & `score_p_`X'' ///
      & `score_N0_`X'' & `score_N1_`X'' \\
    tex              & (`score_sd0_`X'') & (`score_sd1_`X'') &            &         &          &  \\
}   
tex \hline\hline
tex \end{tabular}}    
texdoc close

* Table 3: Beliefs about Productivity by Employee Characteristics
* Paired t tests on one row per employer, comparing the prediction for the comparison group
* (male, US, under 33) with the prediction for the named group (female, India, over 33).
local base_female male
local base_india us
local base_old young
foreach X in female india old {
    ttest pred_`base_`X''=pred_`X' if tag_employer==1
    local pred_mean0_`X' = r(mu_1)
    local pred_mean1_`X' = r(mu_2)
    local pred_sd0_`X' = r(sd_1)
    local pred_sd1_`X' = r(sd_2)
    local pred_diff_`X' = r(mu_1)-r(mu_2)
    local pred_p_`X' = r(p)

    * Format every statistic to two decimals for the table
    foreach Y in mean0 mean1 sd0 sd1 diff p {
        local pred_`Y'_`X': di %6.2f `pred_`Y'_`X''
    }
}

* Write table3.tex line by line. Uses the formatted pred_* locals and the label_* globals, both
* of which must already be defined when this runs.
texdoc init "${tables}table3.tex", replace  
tex {\def\sym#1{\ifmmode^{#1}\else\(^{#1}\)\fi}
tex \begin{tabular}{l*{4}{c}}
tex \hline\hline
tex &\textbf{Group 1}&\textbf{Group 2}&\textbf{Diff.} &\textbf{p-val}\\
tex & (1) & (2) & (3) & (4)\\
tex \hline
* One line of means, difference, and p-value per characteristic, then a line of SDs in parentheses
foreach X in female india old {
    tex ${label_`X'} & `pred_mean0_`X'' & `pred_mean1_`X'' & `pred_diff_`X'' & `pred_p_`X'' \\
    tex              & (`pred_sd0_`X'') & (`pred_sd1_`X'') &            &            \\
}   
tex \hline\hline
tex \end{tabular}}    
texdoc close

* Table 4: Effect of Information - (Difference-in-Differences by Hiring Task)
* postinfo is 1 for Hiring Task 2 and 0 otherwise. Each worker characteristic is interacted
* with it, giving postinfoXfemale_worker, postinfoXindia_worker, and postinfoXold_worker.
* Uses foreach rather than the obsolete, no-longer-documented -for- command.
gen postinfo=hiringtask_number==2
foreach Y in female_worker india_worker old_worker {
    gen postinfoX`Y'=postinfo*`Y'
}

* Table 4 regressions: wage on a worker characteristic, the post-info indicator, and their
* interaction, with standard errors clustered two ways (employer and worker).
* M1-M3 take one characteristic at a time, M4 includes all three, and M5 adds employer fixed
* effects. After each model, DepVarMean stores the mean wage in the estimation sample for
* Hiring Task 1 rows whose included characteristic indicators are all 0.
eststo M1: reghdfe wage female_worker postinfo postinfoXfemale_worker, ///
  cluster(id_employer id_worker) noa
 sum wage if e(sample) & female_worker==0 & hiringtask==1
 estadd scalar DepVarMean=r(mean)
eststo M2: reghdfe wage india_worker postinfo postinfoXindia_worker, ///
  cluster(id_employer id_worker) noa
 sum wage if e(sample) & india_worker==0 & hiringtask==1
 estadd scalar DepVarMean=r(mean)
eststo M3: reghdfe wage old_worker postinfo postinfoXold_worker, cluster(id_employer id_worker) noa
 sum wage if e(sample) & old_worker==0 & hiringtask==1
 estadd scalar DepVarMean=r(mean)
eststo M4: reghdfe wage female_worker india_worker old_worker postinfo ///
   postinfoXfemale_worker postinfoXindia_worker postinfoXold_worker, ///
   cluster(id_employer id_worker) noa
 sum wage if e(sample) & female_worker==0 & india_worker==0 & old_worker==0 ///
   & hiringtask==1
 estadd scalar DepVarMean=r(mean)
eststo M5: reghdfe wage female_worker india_worker old_worker postinfo ///
   postinfoXfemale_worker postinfoXindia_worker postinfoXold_worker, ///
   abs(id_employer) cluster(id_employer id_worker)
 sum wage if e(sample) & female_worker==0 & india_worker==0 & old_worker==0 ///
   & hiringtask==1
 estadd scalar DepVarMean=r(mean)

* Export the five models to table4.tex. The delimiter is switched to ";" so the long esttab
* call can span lines; the postfoot strings supply the "Employer FE?" row and the table footer.
#d ;
esttab M1 M2 M3 M4 M5 using "${tables}table4.tex", replace 
  keep(postinfo female_worker postinfoXfemale_worker india_worker postinfoXindia_worker 
  old_worker postinfoXold_worker)
  order(postinfo female_worker postinfoXfemale_worker india_worker postinfoXindia_worker 
  old_worker postinfoXold_worker)
  coeflabel(postinfo "Post-Info" female_worker "Female" postinfoXfemale_worker "Female X Post-Info"
  india_worker "Indian" postinfoXindia_worker "Indian X Post-Info" old_worker "Over 33"
  postinfoXold_worker "Over 33 X Post-Info") cells(b(star fmt(2)) se(par fmt(2))) 
  legend sty(fixed) nomtitles
  star(* 0.10 ** 0.05 *** 0.01) stat(N r2 DepVarMean, fmt(%12.0gc 2 2) label("N" "\$R^2$"))  
  postfoot(`"Employer FE? &No&No&No&No&Yes \\"'
  `"\hline\hline"' `"\bottomrule"' `"\multicolumn{6}{l}{\footnotesize \sym{*} \(p<0.10\),
  \sym{**} \(p<0.05\), \sym{***} \(p<0.01\)} \\"' `"\end{tabular}"' `"}"') label noabbrev ;
#d cr

****************************************************************************************************
*  3. Appendix Tables and Figures
****************************************************************************************************

* Figure B1: Productivity Distributions by Group
* Three side-by-side panels, each overlaying kernel densities of the trivia score for the two
* groups of one worker characteristic (solid line: indicator = 1, dashed line: indicator = 0).
* Panels are built without drawing, then combined and exported.
* The y-axis title previously read "Kernal Density".
twoway kdensity score if female_worker==1, bwidth(1) lwidth(thick) || kdensity score ///
  if female_worker==0, bwidth(1) lpattern(-) lwidth(thick) ///
  legend(label(1 "Female") label(2 "Male")) ///
  graphregion(fcolor(white) lcolor(white)) bgcolor(white) ytitle("Kernel Density") ///
  xtitle("Trivia Score") name(female, replace) nodraw

twoway kdensity score if india_worker==1, bwidth(1) lwidth(thick) || kdensity score ///
  if india_worker==0, bwidth(1)  lpattern(-) lwidth(thick) ///
  legend(label(1 "Indian") label(2 "American")) ///
  graphregion(fcolor(white) lcolor(white)) bgcolor(white) ytitle("Kernel Density") ///
  xtitle("Trivia Score") name(india, replace) nodraw

twoway kdensity score if old_worker==1, bwidth(1) lwidth(thick) || kdensity score ///
  if old_worker==0, bwidth(1)  lpattern(-) lwidth(thick) ///
  legend(label(1 "Over 33") label(2 "Under 33")) ///
  graphregion(fcolor(white) lcolor(white)) bgcolor(white) ytitle("Kernel Density") ///
  xtitle("Trivia Score") name(old, replace) nodraw
graph combine female india old, col(3) iscale(*.75) graphregion(fcolor(white) lcolor(white))
graph export "${figures}figureB1.png", replace

* Figure B2: Distributions of employers' predicted productivity gaps between groups
* Each panel plots the kernel density, across employers (one row per employer), of the
* difference between that employer's predictions for two groups. A vertical line marks zero.
* The y-axis title previously read "Kernal Density".
gen MF_diff = pred_male-pred_female
gen UI_diff = pred_us-pred_india
gen OY_diff = pred_old-pred_young
twoway kdensity MF_diff if tag_employer==1, bwidth(1) lwidth(thick) legend(off) ///
  graphregion(fcolor(white) lcolor(white)) bgcolor(white) ytitle("Kernel Density") ///
  xtitle("Beliefs About Men vs. Women") name(female, replace) xline(0) nodraw
twoway kdensity UI_diff if tag_employer==1, bwidth(1) lwidth(thick) legend(off) ///
  graphregion(fcolor(white) lcolor(white)) bgcolor(white) ytitle("Kernel Density") ///
  xtitle("Beliefs About American vs. Indian") name(india, replace) xline(0) nodraw
twoway kdensity OY_diff if tag_employer==1, bwidth(1) lwidth(thick) legend(off) ///
  graphregion(fcolor(white) lcolor(white)) bgcolor(white) ytitle("Kernel Density") ///
  xtitle("Beliefs About Over 33 vs. Under 33") name(old, replace) xline(0) nodraw
graph combine female india old, col(3) iscale(*.75) graphregion(fcolor(white) lcolor(white))
graph export "${figures}figureB2.png", replace

* Table B1: Summary Statistics
global summary_worker "score duration_worker preftea_worker age_worker female_worker india_worker"
global summary_employer "duration_employer educ_college_employer age_employer female_employer india_employer"

* Clear stored estimates before the first esttab call. esttab is called here without a list of
* model names; in that case it tabulates any estimation sets stored by eststo and only falls
* back to the active results when none are stored. When the full do-file is run, M1-M5 from
* Table 4 are still stored at this point, so without this line the first call would tabulate
* those models (which carry no mean/sd results) instead of the estpost tabstat output, and the
* Worker Male/Female split would be missing from tableB1.tex.
est clear

* One panel per (sample, split) pair, in this order: workers by gender, country, and age group,
* then employers by gender, country, and age group. The first panel creates the file and the
* rest are appended to it. Statistics are taken over one row per worker or per employer.
local mode replace
foreach side in worker employer {
    foreach grp in female india old {
        estpost tabstat ${summary_`side'} if tag_`side'==1, by(`grp'_`side') ///
          statistics(mean sd N) columns(statistics)
        esttab using "${tables}tableB1.tex", main(mean %8.2f) aux(sd %8.2f) label nostar ///
          unstack nomtitle nonumber `mode'
        est clear
        local mode append
    }
}

* Table B2: Discrimination in Wages, by Employee Characteristics (Hiring Task 1)
* All models use Hiring Task 1 rows only, with standard errors clustered two ways (employer and
* worker). M1-M3 take one worker characteristic at a time and M4 includes all three. M5 adds
* employer fixed effects, and M6-M8 add the worker preference indicators on top of M5.
* After each model, DepVarMean stores the mean wage in the estimation sample for rows whose
* listed demographic indicators are all 0. For M6-M8 the preference indicators are not part of
* that condition.
eststo M1: reghdfe wage female_worker if hiringtask==1, cluster(id_employer id_worker) noa
 sum wage if e(sample) & female_worker==0 & hiringtask==1
 estadd scalar DepVarMean=r(mean)
eststo M2: reghdfe wage india_worker if hiringtask==1, cluster(id_employer id_worker) noa
 sum wage if e(sample) & india_worker==0 & hiringtask==1
 estadd scalar DepVarMean=r(mean)
eststo M3: reghdfe wage old_worker if hiringtask==1, cluster(id_employer id_worker) noa
 sum wage if e(sample) & old_worker==0 & hiringtask==1
 estadd scalar DepVarMean=r(mean)
eststo M4: reghdfe wage female_worker india_worker old_worker if hiringtask==1, ///
  cluster(id_employer id_worker) noa
 sum wage if e(sample) & female_worker==0 & india_worker==0 & old_worker==0 ///
   & hiringtask==1
 estadd scalar DepVarMean=r(mean)
eststo M5: reghdfe wage female_worker india_worker old_worker if hiringtask==1, ///
  abs(id_employer) cluster(id_employer id_worker)
 sum wage if e(sample) & female_worker==0 & india_worker==0 & old_worker==0 ///
   & hiringtask==1
 estadd scalar DepVarMean=r(mean)
eststo M6: reghdfe wage female_worker india_worker old_worker preftea_worker if hiringtask==1, ///
  abs(id_employer) cluster(id_employer id_worker)
 sum wage if e(sample) & female_worker==0 & india_worker==0 & old_worker==0 ///
   & hiringtask==1
 estadd scalar DepVarMean=r(mean)
eststo M7: reghdfe wage female_worker india_worker old_worker math_fav if hiringtask==1, ///
  abs(id_employer) cluster(id_employer id_worker)
 sum wage if e(sample) & female_worker==0 & india_worker==0 & old_worker==0 ///
   & hiringtask==1
 estadd scalar DepVarMean=r(mean)
eststo M8: reghdfe wage female_worker india_worker old_worker preftea_worker math_fav ///
  blue_fav football_fav popmovie_fav if hiringtask==1, ///
  abs(id_employer) cluster(id_employer id_worker)
 sum wage if e(sample) & female_worker==0 & india_worker==0 & old_worker==0 ///
   & hiringtask==1
 estadd scalar DepVarMean=r(mean)


* Export the eight models to tableB2.tex. The delimiter is switched to ";" so the long esttab
* call can span lines; the postfoot strings supply the "Employer FE?" row and the table footer.
#d ;
esttab M1 M2 M3 M4 M5 M6 M7 M8 using "${tables}tableB2.tex", replace 
  keep(female_worker india_worker old_worker preftea_worker math_fav blue_fav 
  football_fav popmovie_fav) coeflabel(female_worker "Female" india_worker "Indian" 
  old_worker "Over 33" preftea_worker "Prefers Tea" math_fav "Fav Subject: Math" 
  blue_fav "Fav Color: Blue" football_fav "Fav Sport: Football" popmovie_fav "Fav Movie: Popular")
  cells(b(star fmt(2)) se(par fmt(2))) legend sty(fixed) nomtitles
  star(* 0.10 ** 0.05 *** 0.01) stat(N r2 DepVarMean, fmt(%12.0gc 2 2) label("N" "\$R^2$"))  
  postfoot(`"Employer FE? &No&No&No&No&Yes&Yes&Yes&Yes \\"'
  `"\hline\hline"' `"\bottomrule"' `"\multicolumn{9}{l}{\footnotesize \sym{*} \(p<0.10\),
  \sym{**} \(p<0.05\), \sym{***} \(p<0.01\)} \\"' `"\end{tabular}"' `"}"') label noabbrev ;
#d cr


  
* Table B3: In-Group Bias Test (Hiring Task 1)
* Each *_match variable is the interaction of a worker characteristic with the same
* characteristic of the employer.
* Uses foreach rather than the obsolete, no-longer-documented -for- command.
foreach Y in female india old {
    gen `Y'_match=`Y'_worker*`Y'_employer
}

* NOTE(review): two changes relative to the earlier version of this block; confirm both against
* the published table before relying on the output.
*   1. The regressions are now restricted to Hiring Task 1, as the table title states and as
*      the Table B2 regressions do. Previously they pooled both hiring tasks while DepVarMean
*      was computed on Hiring Task 1 only.
*   2. DepVarMean for M4 now conditions on all three worker indicators being 0, matching the
*      convention of the multi-characteristic models elsewhere in this file. Previously it
*      conditioned on old_worker==0 only.
eststo M1: reghdfe wage female_worker female_employer female_match if hiringtask_number==1, ///
  cluster(id_employer id_worker) noa
 sum wage if e(sample) & female_worker==0 & hiringtask_number==1
 estadd scalar DepVarMean=r(mean)
eststo M2: reghdfe wage india_worker india_employer india_match if hiringtask_number==1, ///
  cluster(id_employer id_worker) noa
 sum wage if e(sample) & india_worker==0 & hiringtask_number==1
 estadd scalar DepVarMean=r(mean)
eststo M3: reghdfe wage old_worker old_employer old_match if hiringtask_number==1, ///
  cluster(id_employer id_worker) noa
 sum wage if e(sample) & old_worker==0 & hiringtask_number==1
 estadd scalar DepVarMean=r(mean)
eststo M4: reghdfe wage female_worker female_employer female_match india_worker india_employer ///
  india_match old_worker old_employer old_match if hiringtask_number==1, ///
  cluster(id_employer id_worker) noa
 sum wage if e(sample) & female_worker==0 & india_worker==0 & old_worker==0 ///
   & hiringtask_number==1
 estadd scalar DepVarMean=r(mean)

* Export the four models to tableB3.tex. The delimiter is switched to ";" so the long esttab
* call can span lines.
#d ;
esttab M1 M2 M3 M4 using "${tables}tableB3.tex", replace 
  keep(female_worker female_employer female_match india_worker india_employer india_match 
  old_worker old_employer old_match)
  order(female_worker female_employer female_match india_worker india_employer india_match 
  old_worker old_employer old_match)
  coeflabel(female_worker "Female Worker" female_employer "Female Employer" female_match 
  "Female Worker X Employer" india_worker "Indian Worker" india_employer "Indian Employer" 
  india_match "Indian Worker X Employer" old_worker "Over 33 Worker" old_employer "Over 33 Employer"
  old_match "Over 33 Worker X Employer") cells(b(star fmt(2)) se(par fmt(2))) legend sty(fixed) 
  nomtitles star(* 0.10 ** 0.05 *** 0.01) stat(N r2 DepVarMean, fmt(%12.0gc 2 2) 
  label("N" "\$R^2$"))  postfoot(`"\hline\hline"' `"\bottomrule"' 
  `"\multicolumn{5}{l}{\footnotesize \sym{*} \(p<0.10\),
  \sym{**} \(p<0.05\), \sym{***} \(p<0.01\)} \\"' `"\end{tabular}"' `"}"') label noabbrev ;
#d cr

* Table B4: In-Group vs. Out-Group Beliefs about Productivity by Employee Characteristics
*   For each worker group, compare the predicted productivity for that group between
*   employers whose own indicator for the group is 0 ("out") and 1 ("in"), using one
*   observation per employer (tag_employer==1).

* Complement indicators so that male / US / young employers can be the "in" group too
gen male_employer  = abs(1 - female_employer)
gen us_employer    = abs(1 - india_employer)
gen young_employer = abs(1 - old_employer)

foreach grp in female male india us old young {
    quietly ttest pred_`grp' if tag_employer==1, by(`grp'_employer)

    * Raw results: suffix 0 is the first by() group, suffix 1 the second
    forvalues k = 0/1 {
        local j = `k' + 1
        local mean`k'_`grp' = r(mu_`j')
        local sd`k'_`grp'   = r(sd_`j')
        local N`k'_`grp'    = r(N_`j')
    }
    local diff_`grp' = r(mu_1) - r(mu_2)
    local p_`grp'    = r(p)

    * Overwrite each local with its display-formatted text for the LaTeX table
    foreach stat in mean0 sd0 mean1 sd1 p diff {
        local `stat'_`grp': di %6.2f ``stat'_`grp''
    }
    foreach stat in N0 N1 {
        local `stat'_`grp': di %12.0gc ``stat'_`grp''
    }
}

* Row labels used when the table is written
global label_female "Prediction for Female Workers"
global label_male   "Prediction for Male Workers"
global label_india  "Prediction for Indian Workers"
global label_us     "Prediction for US Workers"
global label_old    "Prediction for Over 33 Workers"
global label_young  "Prediction for Under 33 Workers"

* Write tableB4.tex line by line with texdoc (run the file via "texdoc do", see header).
*   Each worker group gets two rows: the first holds the out-group mean, in-group mean,
*   their difference, the t-test p-value, and the two group sizes; the second holds the
*   standard deviations in parentheses. Values come from the formatted locals
*   (mean0_*, mean1_*, diff_*, p_*, N0_*, N1_*, sd0_*, sd1_*) and the label_* globals
*   defined just above.
texdoc init "${tables}tableB4.tex", replace
tex {\def\sym#1{\ifmmode^{#1}\else\(^{#1}\)\fi}
tex \begin{tabular}{l*{6}{c}}
tex \hline\hline
tex &\textbf{Out}&\textbf{In}&\textbf{Diff.}&\textbf{p-val}&\textbf{\#Obs.}&\textbf{\#Obs.}\\
tex &\textbf{Group}&\textbf{Group}&&&\textbf{Out}&\textbf{In}  \\
tex &\multicolumn{1}{c}{(1)}&\multicolumn{1}{c}{(2)}&\multicolumn{1}{c}{(3)}& ///
  \multicolumn{1}{c}{(4)}&\multicolumn{1}{c}{(5)}&\multicolumn{1}{c}{(6)}\\
tex \hline
foreach X in female male india us old young {
    tex ${label_`X'} & `mean0_`X'' & `mean1_`X'' & `diff_`X'' & `p_`X'' ///
      & `N0_`X'' & `N1_`X'' \\
    tex & (`sd0_`X'') & (`sd1_`X'') &            &         &          &   \\
}   
tex \hline\hline
tex \end{tabular}}    
texdoc close
* Remove the helper complement indicators that were generated only for this table
drop male_employer us_employer young_employer

* Predictions about group difference/gaps, by own group identity (as discussed in the text)
*   Each entry lists: gap name, group whose prediction is the minuend, group whose prediction
*   is subtracted. The subtracted group's employer indicator is also the by() variable.
*   The gap variable is temporary and dropped right after its test.
foreach spec in "gendergap male female" "nationgap us india" "agegap young old" {
    local gap  : word 1 of `spec'
    local high : word 2 of `spec'
    local low  : word 3 of `spec'

    gen pred_`gap' = pred_`high' - pred_`low'
    ttest pred_`gap' if tag_employer==1, by(`low'_employer)
    drop pred_`gap'
}

* Table B5: Effects of Large Incentives for Accurate Predictions
*   NOTE: Requires combining the two files into one for tableB5.tex
*   Part 1 (tableB5_sumstats.tex): mean and sd of each prediction by incentivized status.
*   Part 2 (tableB5_diff.tex): t-test of each prediction across incentivized status.
*   Both use one observation per employer (tag_employer==1).
global predictions "pred_female pred_male pred_india pred_us pred_old pred_young"

* Part 1: summary statistics by treatment arm
eststo sumstat: estpost tabstat ${predictions} if tag_employer==1, ///
  by(incentivized) col(stats) stats(mean sd N)
esttab sumstat using "${tables}tableB5_sumstats.tex", ///
  replace main(mean) aux(sd) nostar unstack nonote label ///
  cells(mean(fmt(%6.2f)) sd(fmt(%6.2f) par))

* Part 2: differences across treatment arms with p-values
eststo ttest: estpost ttest ${predictions} if tag_employer==1, by(incentivized)
esttab ttest using "${tables}tableB5_diff.tex", ///
  replace wide ///
  cells((b(fmt(%6.2f) star) p(fmt(2)))) ///
  nonote label ///
  refcat("\textbf{Predictions by Worker Characteristics}", nolabel) ///
  varlabels(pred_female "1" pred_male "2" pred_india "3"  pred_us "4" ///
  pred_old "5" pred_young "6") ///
  star(* 0.10 ** 0.05 *** 0.01)

* Table B6: Beliefs about Productivity by Employee Characteristics, Trimmed
*   Trim each belief-difference variable at the 5th/95th percentiles (winsor2 ..., trim
*   stores the result in <var>_tr), then run paired t-tests of the two group predictions
*   on employers whose trimmed difference is not missing.
foreach pair in MF UI OY {
    winsor2 `pair'_diff if tag_employer==1, cuts(5 95) trim
}

* Each entry lists: row key (group 2), comparison group (group 1), trimmed-difference prefix
foreach spec in "female male MF" "india us UI" "old young OY" {
    local grp  : word 1 of `spec'
    local ref  : word 2 of `spec'
    local pair : word 3 of `spec'

    ttest pred_`ref'=pred_`grp' if tag_employer==1 & `pair'_diff_tr != .
    local pred_mean0_`grp' = r(mu_1)
    local pred_mean1_`grp' = r(mu_2)
    local pred_sd0_`grp'   = r(sd_1)
    local pred_sd1_`grp'   = r(sd_2)
    local pred_diff_`grp'  = r(mu_1)-r(mu_2)
    local pred_p_`grp'     = r(p)

    * Overwrite each local with its two-decimal display text for the LaTeX table
    foreach stat in mean0 mean1 sd0 sd1 diff p {
        local pred_`stat'_`grp': di %6.2f `pred_`stat'_`grp''
    }
}

* Row labels for this table (these replace the label_* globals of the same name set earlier)
global label_female "Gender (1 = Male, 2 = Female)"
global label_india  "Country (1 = US, 2 = India)"
global label_old    "Age (1 = Under 33, 2 = Over 33)"

* Write tableB6.tex line by line with texdoc (run the file via "texdoc do", see header).
*   One pair of rows per characteristic: group 1 mean, group 2 mean, difference, and p-value,
*   then the two standard deviations in parentheses. Values come from the formatted
*   pred_* locals and the label_* globals defined just above.
texdoc init "${tables}tableB6.tex", replace  
tex {\def\sym#1{\ifmmode^{#1}\else\(^{#1}\)\fi}
tex \begin{tabular}{l*{4}{c}}
tex \hline\hline
tex &\textbf{Group 1}&\textbf{Group 2}&\textbf{Diff.} &\textbf{p-val}\\
tex & (1) & (2) & (3) & (4)\\
tex \hline
foreach X in female india old {
    tex ${label_`X'} & `pred_mean0_`X'' & `pred_mean1_`X'' & `pred_diff_`X'' & `pred_p_`X'' \\
    tex              & (`pred_sd0_`X'') & (`pred_sd1_`X'') &            &            \\
}   
tex \hline\hline
tex \end{tabular}}    
texdoc close

****************************************************************************************************
*  4. Additional Stats
****************************************************************************************************

* Correlate beliefs & wages. In the text this is just before Table 3:
*    "We find positive correlations for all six groups of workers (Female: 0.12, Male: 0.12,
*     India: 0.15, U.S.: 0.12, Over 33: 0.12, Under 33: 0.10)"
*   For each characteristic, the prediction for the indicator group is correlated with wage
*   among workers with the indicator equal to 1, and the prediction for the complementary
*   group among workers with the indicator equal to 0.
foreach spec in "female male" "india us" "old young" {
    local member : word 1 of `spec'
    local other  : word 2 of `spec'

    corr wage pred_`member' if `member'_worker==1
    corr wage pred_`other' if `member'_worker==0
}

* Balance Test for Incentives Treatment. In the text this is in the table notes:
*    "The joint f-statistic from regression of an indicator for the ``Incentivized'' treatment on 
*     set of employer observable characteristics in \cref{Table_SummaryStatistics}, Panel B 
*     (duration, education, age, female, from India) is 1.31 (p=0.260)."
*   NOTE(review): this comment previously cited Table B4, but the header of this file lists
*   the incentives table as Table B5 -- confirm which table's notes carry the statistic.
*   The joint test covers every regressor in the model; its p-value is displayed afterwards.
local balance_covars duration_employer educ_college_employer age_employer ///
  female_employer india_employer
reg incentivized `balance_covars' if tag_employer==1
testparm `balance_covars'
disp r(p)

* First 10 profiles seen in Hiring Task 1 not diff than second 10 profiles in Hiring Task 1
* In the text this is in footnote 35:
*    "To investigate this channel, we perform a test comparing the average wages assigned in the 
*     first 10 profiles and the second 10 profiles during the initial task. We do not find evidence 
*     for an experience effect (36.86 vs. 36.72; p=.39)"
* NOTE(review): the footnote describes a comparison of wages, while the test below is run on
* the variable score -- confirm that score is the intended outcome here.
*
* Stata treats missing as larger than any number, so a bare "order<=10" evaluates to 0 when
* order is missing and would silently place such rows in the "second 10" group. Leaving
* first10 missing in that case lets ttest drop those rows instead.
gen first10 = order<=10 if !missing(order)
ttest score if hiringtask_number==1, by(first10)

* Close the log. The do-file ends here; the former trailing "stop" line was not a Stata
* command and made every run finish with an "unrecognized command" error (r(199)), which a
* calling do-file or batch run would see as a failure.
log close

